################################################################################
#                  skforecast.model_selection._validation                      #
#                                                                              #
# This work by skforecast team is licensed under the BSD 3-Clause License.     #
################################################################################
# coding=utf-8

from __future__ import annotations
from typing_extensions import deprecated
from typing import Callable
from copy import deepcopy
from itertools import chain
import warnings
import numpy as np
import pandas as pd
from joblib import Parallel, delayed, cpu_count
from tqdm.auto import tqdm
from ..metrics import add_y_train_argument, _get_metric
from ..exceptions import LongTrainingWarning, IgnoredArgumentWarning, runtime_deprecated
from ..model_selection._split import TimeSeriesFold
from ..model_selection._utils import (
    _initialize_levels_model_selection_multiseries,
    check_backtesting_input,
    select_n_jobs_backtesting,
    _extract_data_folds_multiseries,
    _calculate_metrics_backtesting_multiseries
)
from ..utils import (
    check_preprocess_series,
    check_preprocess_exog_multiseries,
    set_skforecast_warnings
)


def _backtesting_forecaster(
    forecaster: object,
    y: pd.Series,
    metric: str | Callable | list[str | Callable],
    cv: TimeSeriesFold,
    exog: pd.Series | pd.DataFrame | None = None,
    interval: float | list[float] | tuple[float] | str | object | None = None,
    interval_method: str = 'bootstrapping',
    n_boot: int = 250,
    use_in_sample_residuals: bool = True,
    use_binned_residuals: bool = True,
    random_state: int = 123,
    return_predictors: bool = False,
    n_jobs: int | str = 'auto',
    verbose: bool = False,
    show_progress: bool = True
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Backtesting of forecaster model following the folds generated by the TimeSeriesFold
    class and using the metric(s) provided.

    If `forecaster` is already trained and `initial_train_size` is set to `None` in the
    TimeSeriesFold class, no initial train will be done and all data will be used
    to evaluate the model. However, the first `len(forecaster.last_window)` observations
    are needed to create the initial predictors, so no predictions are calculated for
    them.
    
    A copy of the original forecaster is created so that it is not modified during the
    process.
    
    Parameters
    ----------
    forecaster : ForecasterRecursive, ForecasterDirect, ForecasterEquivalentDate
        Forecaster model.
    y : pandas Series
        Training time series.
    metric : str, Callable, list
        Metric used to quantify the goodness of fit of the model.
        
        - If `string`: {'mean_squared_error', 'mean_absolute_error',
        'mean_absolute_percentage_error', 'mean_squared_log_error',
        'mean_absolute_scaled_error', 'root_mean_squared_scaled_error'}
        - If `Callable`: Function with arguments `y_true`, `y_pred` and `y_train`
        (Optional) that returns a float.
        - If `list`: List containing multiple strings and/or Callables.
    cv : TimeSeriesFold
        TimeSeriesFold object with the information needed to split the data into folds.
    exog : pandas Series, pandas DataFrame, default None
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `y` and should be aligned so that y[i] is
        regressed on exog[i].
    interval : float, list, tuple, str, object, default None
        Specifies whether probabilistic predictions should be estimated and the 
        method to use. The following options are supported:

        - If `float`, represents the nominal (expected) coverage (between 0 and 1). 
        For instance, `interval=0.95` corresponds to `[2.5, 97.5]` percentiles.
        - If `list` or `tuple`: Sequence of percentiles to compute, each value must 
        be between 0 and 100 inclusive. For example, a 95% confidence interval can 
        be specified as `interval = [2.5, 97.5]` or multiple percentiles (e.g. 10, 
        50 and 90) as `interval = [10, 50, 90]`.
        - If 'bootstrapping' (str): `n_boot` bootstrapping predictions will be generated.
        - If scipy.stats distribution object, the distribution parameters will
        be estimated for each prediction.
        - If None, no probabilistic predictions are estimated.
    interval_method : str, default 'bootstrapping'
        Technique used to estimate prediction intervals. Available options:

        - 'bootstrapping': Bootstrapping is used to generate prediction 
        intervals [1]_.
        - 'conformal': Employs the conformal prediction split method for 
        interval estimation [2]_.
    n_boot : int, default 250
        Number of bootstrapping iterations to perform when estimating prediction
        intervals.
    use_in_sample_residuals : bool, default True
        If `True`, residuals from the training data are used as proxy of
        prediction error to create predictions. 
        If `False`, out of sample residuals (calibration) are used. 
        Out-of-sample residuals must be precomputed using Forecaster's
        `set_out_sample_residuals()` method.
    use_binned_residuals : bool, default True
        If `True`, residuals are selected based on the predicted values 
        (binned selection).
        If `False`, residuals are selected randomly.
    random_state : int, default 123
        Seed for the random number generator to ensure reproducibility.
    return_predictors : bool, default False
        If `True`, the predictors used to make the predictions are also returned.
    n_jobs : int, 'auto', default 'auto'
        The number of jobs to run in parallel. If `-1`, then the number of jobs is 
        set to the number of cores. If 'auto', `n_jobs` is set using the function
        skforecast.utils.select_n_jobs_backtesting.
    verbose : bool, default False
        Print number of folds and index of training and validation sets used 
        for backtesting.
    show_progress : bool, default True
        Whether to show a progress bar.

    Returns
    -------
    metric_values : pandas DataFrame
        Value(s) of the metric(s).
    backtest_predictions : pandas DataFrame
        Value of predictions. The DataFrame includes the following columns:

        - fold: Indicates the fold number where the prediction was made.
        - pred: Predicted values for the corresponding series and time steps.

        If `interval` is not `None`, additional columns are included depending on the method:
        
        - For `float`: Columns `lower_bound` and `upper_bound`.
        - For `list` or `tuple` of 2 elements: Columns `lower_bound` and `upper_bound`.
        - For `list` or `tuple` with multiple percentiles: One column per percentile 
        (e.g., `p_10`, `p_50`, `p_90`).
        - For `'bootstrapping'`: One column per bootstrapping iteration 
        (e.g., `pred_boot_0`, `pred_boot_1`, ..., `pred_boot_n`).
        - For `scipy.stats` distribution objects: One column for each estimated 
        parameter of the distribution (e.g., `loc`, `scale`).

        If `return_predictors` is `True`, one column per predictor is created.

        Depending on the relation between `steps` and `fold_stride`, the output
        may include repeated indexes (if `fold_stride < steps`) or gaps
        (if `fold_stride > steps`). See Notes below for more details.

    Notes
    -----
    Note on `fold_stride` vs. `steps`:

    - If `fold_stride == steps`, test sets are placed back-to-back without overlap. 
    Each observation appears only once in the output DataFrame, so the index is unique.
    - If `fold_stride < steps`, test sets overlap. Multiple forecasts are generated 
    for the same observations and, therefore, the output DataFrame contains repeated 
    indexes.
    - If `fold_stride > steps`, there are gaps between consecutive test sets. 
    Some observations in the series will not have associated predictions, so 
    the output DataFrame has non-contiguous indexes.

    References
    ----------
    .. [1] Forecasting: Principles and Practice (3rd ed) Rob J Hyndman and George Athanasopoulos.
           https://otexts.com/fpp3/prediction-intervals.html
    
    .. [2] MAPIE - Model Agnostic Prediction Interval Estimator.
           https://mapie.readthedocs.io/en/stable/theoretical_description_regression.html#the-split-method
    
    """

    # Work on copies so the caller's forecaster and cv objects are never mutated.
    forecaster = deepcopy(forecaster)
    is_regression = forecaster.__skforecast_tags__['forecaster_task'] == 'regression'
    cv = deepcopy(cv)

    # Sync the cross-validator with the forecaster's window requirements.
    cv.set_params({
        'window_size': forecaster.window_size,
        'differentiation': forecaster.differentiation_max,
        'return_all_indexes': False,
        'verbose': verbose
    })

    refit = cv.refit
    overlapping_folds = cv.overlapping_folds

    if n_jobs == 'auto':
        n_jobs = select_n_jobs_backtesting(
                     forecaster = forecaster,
                     refit      = refit
                 )
    elif not isinstance(refit, bool) and refit != 1 and n_jobs != 1:
        # Intermittent refit (refit every k folds) makes folds stateful, so
        # parallel execution could use a stale model; force sequential runs.
        warnings.warn(
            "If `refit` is an integer other than 1 (intermittent refit). `n_jobs` "
            "is set to 1 to avoid unexpected results during parallelization.",
            IgnoredArgumentWarning
        )
        n_jobs = 1
    else:
        n_jobs = n_jobs if n_jobs > 0 else cpu_count()

    # Normalize `metric` into a list of callables accepting (y_true, y_pred, y_train).
    if not isinstance(metric, list):
        metrics = [
            _get_metric(metric=metric)
            if isinstance(metric, str)
            else add_y_train_argument(metric)
        ]
    else:
        metrics = [
            _get_metric(metric=m)
            if isinstance(m, str)
            else add_y_train_argument(m) 
            for m in metric
        ]

    store_in_sample_residuals = bool(use_in_sample_residuals)
    if interval is None:
        forecaster._probabilistic_mode = False
    elif use_binned_residuals:
        forecaster._probabilistic_mode = 'binned'
    else:
        forecaster._probabilistic_mode = 'no_binned'

    folds = cv.split(X=y, as_pandas=False)
    initial_train_size = cv.initial_train_size
    window_size = cv.window_size
    gap = cv.gap

    if initial_train_size is not None:
        # NOTE: This allows for parallelization when `refit` is `False`. The initial 
        # Forecaster fit occurs outside of the auxiliary function.
        exog_train = exog.iloc[:initial_train_size, ] if exog is not None else None
        forecaster.fit(
            y                         = y.iloc[:initial_train_size, ],
            exog                      = exog_train,
            store_in_sample_residuals = store_in_sample_residuals
        )
        # fold[5] is the "fit this fold" flag; the first fold was just fitted above.
        folds[0][5] = False

    if refit:
        n_of_fits = int(len(folds) / refit)
        if type(forecaster).__name__ != 'ForecasterDirect' and n_of_fits > 50:
            warnings.warn(
                f"The forecaster will be fit {n_of_fits} times. This can take substantial"
                f" amounts of time. If not feasible, try with `refit = False`.\n",
                LongTrainingWarning
            )
        elif type(forecaster).__name__ == 'ForecasterDirect' and n_of_fits * forecaster.max_step > 50:
            # ForecasterDirect fits one estimator per step, multiplying the cost.
            warnings.warn(
                f"The forecaster will be fit {n_of_fits * forecaster.max_step} times "
                f"({n_of_fits} folds * {forecaster.max_step} estimators). This can take "
                f"substantial amounts of time. If not feasible, try with `refit = False`.\n",
                LongTrainingWarning
            )

    if show_progress:
        folds = tqdm(folds)

    def _fit_predict_forecaster(
        fold, forecaster, y, exog, store_in_sample_residuals, gap, interval, 
        interval_method, n_boot, use_in_sample_residuals, use_binned_residuals, 
        random_state, return_predictors, is_regression
    ) -> pd.DataFrame:
        """
        Fit the forecaster and predict `steps` ahead. This is an auxiliary 
        function used to parallelize the backtesting_forecaster function.
        """

        train_iloc_start       = fold[1][0]
        train_iloc_end         = fold[1][1]
        last_window_iloc_start = fold[2][0]
        last_window_iloc_end   = fold[2][1]
        test_iloc_start        = fold[3][0]
        test_iloc_end          = fold[3][1]

        if fold[5] is False:
            # When the model is not fitted, last_window must be updated to include
            # the data needed to make predictions.
            last_window_y = y.iloc[last_window_iloc_start:last_window_iloc_end]
        else:
            # The model is fitted before making predictions. If `fixed_train_size`
            # the train size doesn't increase but moves by `steps` in each iteration.
            # If `False` the train size increases by `steps` in each iteration.
            y_train = y.iloc[train_iloc_start:train_iloc_end, ]
            exog_train = (
                exog.iloc[train_iloc_start:train_iloc_end,] if exog is not None else None
            )
            last_window_y = None
            forecaster.fit(
                y                         = y_train, 
                exog                      = exog_train, 
                store_in_sample_residuals = store_in_sample_residuals
            )

        next_window_exog = exog.iloc[test_iloc_start:test_iloc_end, ] if exog is not None else None

        steps = len(range(test_iloc_start, test_iloc_end))
        if type(forecaster).__name__ == 'ForecasterDirect' and gap > 0:
            # Select only the steps that need to be predicted if gap > 0
            test_no_gap_iloc_start = fold[4][0]
            test_no_gap_iloc_end   = fold[4][1]
            steps = list(
                np.arange(len(range(test_no_gap_iloc_start, test_no_gap_iloc_end)))
                + gap
                + 1
            )

        preds = []
        if is_regression:
            if interval is not None:
                kwargs_interval = {
                    'steps': steps,
                    'last_window': last_window_y,
                    'exog': next_window_exog,
                    'n_boot': n_boot,
                    'use_in_sample_residuals': use_in_sample_residuals,
                    'use_binned_residuals': use_binned_residuals,
                    'random_state': random_state
                }
                if interval_method == 'bootstrapping':
                    if interval == 'bootstrapping':
                        pred = forecaster.predict_bootstrapping(**kwargs_interval)
                    elif isinstance(interval, (list, tuple)):
                        quantiles = [q / 100 for q in interval]
                        pred = forecaster.predict_quantiles(quantiles=quantiles, **kwargs_interval)
                        if len(interval) == 2:
                            pred.columns = ['lower_bound', 'upper_bound']
                        else:
                            pred.columns = [f'p_{p}' for p in interval]
                    else:
                        pred = forecaster.predict_dist(distribution=interval, **kwargs_interval)
                    
                    preds.append(pred)
                else:
                    pred = forecaster.predict_interval(
                        method='conformal', interval=interval, **kwargs_interval
                    )
                    preds.append(pred)

            # NOTE: This is done after probabilistic predictions to avoid repeating 
            # the same checks.
            if interval is None or interval_method != 'conformal':
                pred = forecaster.predict(
                        steps        = steps,
                        last_window  = last_window_y,
                        exog         = next_window_exog,
                        check_inputs = interval is None
                    )
                preds.insert(0, pred)
        else:
            pred = forecaster.predict_proba(
                       steps       = steps,
                       last_window = last_window_y,
                       exog        = next_window_exog
                   )
            preds.append(pred)

        if return_predictors:
            pred = forecaster.create_predict_X(
                       steps        = steps,
                       last_window  = last_window_y,
                       exog         = next_window_exog,
                       check_inputs = False
                   )
            preds.append(pred)

        if len(preds) == 1:
            pred = preds[0]
        else:
            pred = pd.concat(preds, axis=1)

        if type(forecaster).__name__ != 'ForecasterDirect' and gap > 0:
            # Recursive forecasters predict through the gap; drop those rows.
            pred = pred.iloc[gap:, ]
        
        return pred

    kwargs_fit_predict_forecaster = {
        "forecaster": forecaster,
        "y": y,
        "exog": exog,
        "store_in_sample_residuals": store_in_sample_residuals,
        "gap": gap,
        "interval": interval,
        "interval_method": interval_method,
        "n_boot": n_boot,
        "use_in_sample_residuals": use_in_sample_residuals,
        "use_binned_residuals": use_binned_residuals,
        "random_state": random_state,
        "return_predictors": return_predictors,
        'is_regression': is_regression
    }
    backtest_predictions = Parallel(n_jobs=n_jobs)(
        delayed(_fit_predict_forecaster)(
            fold=fold, **kwargs_fit_predict_forecaster
        )
        for fold in folds
    )
    # fold[0] is the fold label; repeat it once per predicted row.
    fold_labels = [
        np.repeat(fold[0], backtest_predictions[i].shape[0]) for i, fold in enumerate(folds)
    ]

    backtest_predictions = pd.concat(backtest_predictions)
    if isinstance(backtest_predictions, pd.Series):
        backtest_predictions = backtest_predictions.to_frame()

    if not is_regression:
        # For classifiers, derive the point prediction as the class with the
        # highest predicted probability.
        proba_cols = [f"{cls}_proba" for cls in forecaster.classes_]
        idx_max = backtest_predictions[proba_cols].to_numpy().argmax(axis=1)
        backtest_predictions.insert(0, "pred", np.array(forecaster.classes_)[idx_max])

    backtest_predictions.insert(0, 'fold', np.concatenate(fold_labels))

    train_indexes = []
    for i, fold in enumerate(folds):
        # fold[-1] is the "fit this fold" flag; only folds where a (re)fit
        # occurred contribute training observations.
        fit_fold = fold[-1]
        if i == 0 or fit_fold:
            # NOTE: When using a scaled metric, `y_train` doesn't include the
            # first window_size observations used to create the predictors and/or
            # rolling features.
            train_iloc_start = fold[1][0] + window_size
            train_iloc_end = fold[1][1]
            train_indexes.append(np.arange(train_iloc_start, train_iloc_end))
    
    train_indexes = np.unique(np.concatenate(train_indexes))
    y_train = y.iloc[train_indexes]

    backtest_predictions_for_metrics = backtest_predictions
    if overlapping_folds:
        # Overlapping test sets produce repeated indexes; keep only the last
        # prediction for each timestamp when computing metrics.
        backtest_predictions_for_metrics = (
            backtest_predictions_for_metrics
            .loc[~backtest_predictions_for_metrics.index.duplicated(keep='last')]
        )

    metric_values = [
        m(
            y_true = y.loc[backtest_predictions_for_metrics.index],
            y_pred = backtest_predictions_for_metrics['pred'],
            y_train = y_train
        ) 
        for m in metrics
    ]

    metric_values = pd.DataFrame(
        data    = [metric_values],
        columns = [m.__name__ for m in metrics]
    )
    
    return metric_values, backtest_predictions


def backtesting_forecaster(
    forecaster: object,
    y: pd.Series,
    cv: TimeSeriesFold,
    metric: str | Callable | list[str | Callable],
    exog: pd.Series | pd.DataFrame | None = None,
    interval: float | list[float] | tuple[float] | str | object | None = None,
    interval_method: str = 'bootstrapping',
    n_boot: int = 250,
    use_in_sample_residuals: bool = True,
    use_binned_residuals: bool = True,
    random_state: int = 123,
    return_predictors: bool = False,
    n_jobs: int | str = 'auto',
    verbose: bool = False,
    show_progress: bool = True
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Backtesting of forecaster model following the folds generated by the TimeSeriesFold
    class and using the metric(s) provided.

    If `forecaster` is already trained and `initial_train_size` is set to `None` in the
    TimeSeriesFold class, no initial train will be done and all data will be used
    to evaluate the model. However, the first `len(forecaster.last_window)` observations
    are needed to create the initial predictors, so no predictions are calculated for
    them.
    
    A copy of the original forecaster is created so that it is not modified during 
    the process.

    Parameters
    ----------
    forecaster : ForecasterRecursive, ForecasterDirect, ForecasterEquivalentDate
        Forecaster model.
    y : pandas Series
        Training time series.
    cv : TimeSeriesFold
        TimeSeriesFold object with the information needed to split the data into folds.
    metric : str, Callable, list
        Metric used to quantify the goodness of fit of the model.
        
        - If `string`: {'mean_squared_error', 'mean_absolute_error',
        'mean_absolute_percentage_error', 'mean_squared_log_error',
        'mean_absolute_scaled_error', 'root_mean_squared_scaled_error'}
        - If `Callable`: Function with arguments `y_true`, `y_pred` and `y_train`
        (Optional) that returns a float.
        - If `list`: List containing multiple strings and/or Callables.
    exog : pandas Series, pandas DataFrame, default None
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `y` and should be aligned so that y[i] is
        regressed on exog[i].
    interval : float, list, tuple, str, object, default None
        Specifies whether probabilistic predictions should be estimated and the 
        method to use. The following options are supported:

        - If `float`, represents the nominal (expected) coverage (between 0 and 1). 
        For instance, `interval=0.95` corresponds to `[2.5, 97.5]` percentiles.
        - If `list` or `tuple`: Sequence of percentiles to compute, each value must 
        be between 0 and 100 inclusive. For example, a 95% confidence interval can 
        be specified as `interval = [2.5, 97.5]` or multiple percentiles (e.g. 10, 
        50 and 90) as `interval = [10, 50, 90]`.
        - If 'bootstrapping' (str): `n_boot` bootstrapping predictions will be generated.
        - If scipy.stats distribution object, the distribution parameters will
        be estimated for each prediction.
        - If None, no probabilistic predictions are estimated.
    interval_method : str, default 'bootstrapping'
        Technique used to estimate prediction intervals. Available options:

        - 'bootstrapping': Bootstrapping is used to generate prediction 
        intervals [1]_.
        - 'conformal': Employs the conformal prediction split method for 
        interval estimation [2]_.
    n_boot : int, default 250
        Number of bootstrapping iterations to perform when estimating prediction
        intervals.
    use_in_sample_residuals : bool, default True
        If `True`, residuals from the training data are used as proxy of
        prediction error to create predictions. 
        If `False`, out of sample residuals (calibration) are used. 
        Out-of-sample residuals must be precomputed using Forecaster's
        `set_out_sample_residuals()` method.
    use_binned_residuals : bool, default True
        If `True`, residuals are selected based on the predicted values 
        (binned selection).
        If `False`, residuals are selected randomly.
    random_state : int, default 123
        Seed for the random number generator to ensure reproducibility.
    return_predictors : bool, default False
        If `True`, the predictors used to make the predictions are also returned.
    n_jobs : int, 'auto', default 'auto'
        The number of jobs to run in parallel. If `-1`, then the number of jobs is 
        set to the number of cores. If 'auto', `n_jobs` is set using the function
        skforecast.utils.select_n_jobs_backtesting.
    verbose : bool, default False
        Print number of folds and index of training and validation sets used 
        for backtesting.
    show_progress : bool, default True
        Whether to show a progress bar.

    Returns
    -------
    metric_values : pandas DataFrame
        Value(s) of the metric(s).
    backtest_predictions : pandas DataFrame
        Value of predictions. The DataFrame includes the following columns:

        - fold: Indicates the fold number where the prediction was made.
        - pred: Predicted values for the corresponding series and time steps.

        If `interval` is not `None`, additional columns are included depending on the method:
        
        - For `float`: Columns `lower_bound` and `upper_bound`.
        - For `list` or `tuple` of 2 elements: Columns `lower_bound` and `upper_bound`.
        - For `list` or `tuple` with multiple percentiles: One column per percentile 
        (e.g., `p_10`, `p_50`, `p_90`).
        - For `'bootstrapping'`: One column per bootstrapping iteration 
        (e.g., `pred_boot_0`, `pred_boot_1`, ..., `pred_boot_n`).
        - For `scipy.stats` distribution objects: One column for each estimated 
        parameter of the distribution (e.g., `loc`, `scale`).

        If `return_predictors` is `True`, one column per predictor is created.

        Depending on the relation between `steps` and `fold_stride`, the output
        may include repeated indexes (if `fold_stride < steps`) or gaps
        (if `fold_stride > steps`). See Notes below for more details.

    Notes
    -----
    Note on `fold_stride` vs. `steps`:

    - If `fold_stride == steps`, test sets are placed back-to-back without overlap. 
    Each observation appears only once in the output DataFrame, so the index is unique.
    - If `fold_stride < steps`, test sets overlap. Multiple forecasts are generated 
    for the same observations and, therefore, the output DataFrame contains repeated 
    indexes.
    - If `fold_stride > steps`, there are gaps between consecutive test sets. 
    Some observations in the series will not have associated predictions, so 
    the output DataFrame has non-contiguous indexes.

    References
    ----------
    .. [1] Forecasting: Principles and Practice (3rd ed) Rob J Hyndman and George Athanasopoulos.
           https://otexts.com/fpp3/prediction-intervals.html
    
    .. [2] MAPIE - Model Agnostic Prediction Interval Estimator.
           https://mapie.readthedocs.io/en/stable/theoretical_description_regression.html#the-split-method
    
    """

    # Single-series forecasters supported by this entry point; multi-series
    # forecasters must use the corresponding `model_selection` functions.
    forecasters_allowed = [
        'ForecasterRecursive', 
        'ForecasterDirect',
        'ForecasterEquivalentDate',
        'ForecasterRecursiveClassifier'
    ]
    
    if type(forecaster).__name__ not in forecasters_allowed:
        raise TypeError(
            f"`forecaster` must be of type {forecasters_allowed}, for all other types "
            f"of forecasters use the functions available in the other "
            f"`model_selection` modules."
        )
    
    # Validate all user-supplied arguments before starting the (potentially
    # expensive) backtesting process.
    check_backtesting_input(
        forecaster              = forecaster,
        cv                      = cv,
        y                       = y,
        metric                  = metric,
        interval                = interval,
        interval_method         = interval_method,
        n_boot                  = n_boot,
        use_in_sample_residuals = use_in_sample_residuals,
        use_binned_residuals    = use_binned_residuals,
        random_state            = random_state,
        return_predictors       = return_predictors,
        n_jobs                  = n_jobs,
        show_progress           = show_progress
    )
    
    metric_values, backtest_predictions = _backtesting_forecaster(
        forecaster              = forecaster,
        y                       = y,
        cv                      = cv,
        metric                  = metric,
        exog                    = exog,
        interval                = interval,
        interval_method         = interval_method,
        n_boot                  = n_boot,
        use_in_sample_residuals = use_in_sample_residuals,
        use_binned_residuals    = use_binned_residuals,
        random_state            = random_state,
        return_predictors       = return_predictors,
        n_jobs                  = n_jobs,
        verbose                 = verbose,
        show_progress           = show_progress
    )

    return metric_values, backtest_predictions


def _backtesting_forecaster_multiseries(
    forecaster: object,
    series: pd.DataFrame | dict[str, pd.Series | pd.DataFrame],
    cv: TimeSeriesFold,
    metric: str | Callable | list[str | Callable],
    levels: str | list[str] | None = None,
    add_aggregated_metric: bool = True,
    exog: pd.Series | pd.DataFrame | dict[str, pd.Series | pd.DataFrame] | None = None,
    interval: float | list[float] | tuple[float] | str | object | None = None,
    interval_method: str = 'conformal',
    n_boot: int = 250,
    use_in_sample_residuals: bool = True,
    use_binned_residuals: bool = True,
    random_state: int = 123,
    return_predictors: bool = False,
    n_jobs: int | str = 'auto',
    verbose: bool = False,
    show_progress: bool = True,
    suppress_warnings: bool = False
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Backtesting of forecaster model following the folds generated by the TimeSeriesFold
    class and using the metric(s) provided.

    If `forecaster` is already trained and `initial_train_size` is set to `None` in the
    TimeSeriesFold class, no initial train will be done and all data will be used
    to evaluate the model. However, the first `len(forecaster.last_window)` observations
    are needed to create the initial predictors, so no predictions are calculated for
    them.
    
    A copy of the original forecaster is created so that it is not modified during the
    process.
    
    Parameters
    ----------
    forecaster : ForecasterRecursiveMultiSeries, ForecasterDirectMultiVariate, ForecasterRnn
        Forecaster model.
    series : pandas DataFrame, dict
        Training time series.
    cv : TimeSeriesFold
        TimeSeriesFold object with the information needed to split the data into folds.
    metric : str, Callable, list
        Metric used to quantify the goodness of fit of the model.
        
        - If `string`: {'mean_squared_error', 'mean_absolute_error',
        'mean_absolute_percentage_error', 'mean_squared_log_error'}
        - If `Callable`: Function with arguments y_true, y_pred that returns a float.
        - If `list`: List containing multiple strings and/or Callables.
    levels : str, list, default None
        Time series to be predicted. If `None` all levels will be predicted.
    add_aggregated_metric : bool, default True
        If `True`, and multiple series (`levels`) are predicted, the aggregated
        metrics (average, weighted average and pooled) are also returned.

        - 'average': the average (arithmetic mean) of all levels.
        - 'weighted_average': the average of the metrics weighted by the number of
        predicted values of each level.
        - 'pooling': the values of all levels are pooled and then the metric is
        calculated.
    exog : pandas Series, pandas DataFrame, dict, default None
        Exogenous variables.
    interval : float, list, tuple, str, object, default None
        Specifies whether probabilistic predictions should be estimated and the 
        method to use. The following options are supported:

        - If `float`, represents the nominal (expected) coverage (between 0 and 1). 
        For instance, `interval=0.95` corresponds to `[2.5, 97.5]` percentiles.
        - If `list` or `tuple`: Sequence of percentiles to compute, each value must 
        be between 0 and 100 inclusive. For example, a 95% confidence interval can 
        be specified as `interval = [2.5, 97.5]` or multiple percentiles (e.g. 10, 
        50 and 90) as `interval = [10, 50, 90]`.
        - If 'bootstrapping' (str): `n_boot` bootstrapping predictions will be generated.
        - If scipy.stats distribution object, the distribution parameters will
        be estimated for each prediction.
        - If None, no probabilistic predictions are estimated.
    interval_method : str, default 'conformal'
        Technique used to estimate prediction intervals. Available options:

        - 'bootstrapping': Bootstrapping is used to generate prediction 
        intervals [1]_.
        - 'conformal': Employs the conformal prediction split method for 
        interval estimation [2]_.
    n_boot : int, default 250
        Number of bootstrapping iterations to perform when estimating prediction
        intervals.
    use_in_sample_residuals : bool, default True
        If `True`, residuals from the training data are used as proxy of
        prediction error to create predictions. 
        If `False`, out of sample residuals (calibration) are used. 
        Out-of-sample residuals must be precomputed using Forecaster's
        `set_out_sample_residuals()` method.
    use_binned_residuals : bool, default True
        If `True`, residuals are selected based on the predicted values 
        (binned selection).
        If `False`, residuals are selected randomly.
    random_state : int, default 123
        Seed for the random number generator to ensure reproducibility.
    return_predictors : bool, default False
        If `True`, the predictors used to make the predictions are also returned.
    n_jobs : int, 'auto', default 'auto'
        The number of jobs to run in parallel. If `-1`, then the number of jobs is 
        set to the number of cores. If 'auto', `n_jobs` is set using the function
        skforecast.utils.select_n_jobs_backtesting.
    verbose : bool, default False
        Print number of folds and index of training and validation sets used 
        for backtesting.
    show_progress : bool, default True
        Whether to show a progress bar.
    suppress_warnings : bool, default False
        If `True`, skforecast warnings will be suppressed during the backtesting 
        process. See skforecast.exceptions.warn_skforecast_categories for more
        information.

    Returns
    -------
    metrics_levels : pandas DataFrame
        Value(s) of the metric(s). Index are the levels and columns the metrics.
    backtest_predictions : pandas DataFrame
        Long-format DataFrame containing the predicted values for each series. The 
        DataFrame includes the following columns:
        
        - `level`: Identifier for the time series or level being predicted.
        - `fold`: Indicates the fold number where the prediction was made.
        - `pred`: Predicted values for the corresponding series and time steps.

        If `interval` is not `None`, additional columns are included depending on the method:
        
        - For `float`: Columns `lower_bound` and `upper_bound`.
        - For `list` or `tuple` of 2 elements: Columns `lower_bound` and `upper_bound`.
        - For `list` or `tuple` with multiple percentiles: One column per percentile 
        (e.g., `p_10`, `p_50`, `p_90`).
        - For `'bootstrapping'`: One column per bootstrapping iteration 
        (e.g., `pred_boot_0`, `pred_boot_1`, ..., `pred_boot_n`).
        - For `scipy.stats` distribution objects: One column for each estimated 
        parameter of the distribution (e.g., `loc`, `scale`).

        If `return_predictors` is `True`, one column per predictor is created.

        Depending on the relation between `steps` and `fold_stride`, the output
        may include repeated indexes (if `fold_stride < steps`) or gaps
        (if `fold_stride > steps`). See Notes below for more details.

    Notes
    -----
    Note on `fold_stride` vs. `steps`:

    - If `fold_stride == steps`, test sets are placed back-to-back without overlap. 
    Each observation appears only once in the output DataFrame, so the index is unique.
    - If `fold_stride < steps`, test sets overlap. Multiple forecasts are generated 
    for the same observations and, therefore, the output DataFrame contains repeated 
    indexes.
    - If `fold_stride > steps`, there are gaps between consecutive test sets. 
    Some observations in the series will not have associated predictions, so 
    the output DataFrame has non-contiguous indexes.

    References
    ----------
    .. [1] Forecasting: Principles and Practice (3rd ed) Rob J Hyndman and George Athanasopoulos.
           https://otexts.com/fpp3/prediction-intervals.html
    
    .. [2] MAPIE - Model Agnostic Prediction Interval Estimator.
           https://mapie.readthedocs.io/en/stable/theoretical_description_regression.html#the-split-method

    """

    set_skforecast_warnings(suppress_warnings, action='ignore')

    # Work on copies so neither the forecaster nor the cv object passed by the
    # caller is modified during the backtesting process.
    forecaster = deepcopy(forecaster)
    cv = deepcopy(cv)

    cv.set_params({
        'window_size': forecaster.window_size,
        'differentiation': forecaster.differentiation_max,
        'return_all_indexes': False,
        'verbose': verbose
    })

    refit = cv.refit
    overlapping_folds = cv.overlapping_folds

    if n_jobs == 'auto':
        n_jobs = select_n_jobs_backtesting(
                     forecaster = forecaster,
                     refit      = refit
                 )
    elif not isinstance(refit, bool) and refit != 1 and n_jobs != 1:
        warnings.warn(
            "If `refit` is an integer other than 1 (intermittent refit). `n_jobs` "
            "is set to 1 to avoid unexpected results during parallelization.",
            IgnoredArgumentWarning
        )
        n_jobs = 1
    else:
        n_jobs = n_jobs if n_jobs > 0 else cpu_count()

    levels = _initialize_levels_model_selection_multiseries(
                 forecaster = forecaster,
                 series     = series,
                 levels     = levels
             )

    # Normalize `metric` to a list of callables that accept a `y_train` argument.
    if not isinstance(metric, list):
        metrics = [
            _get_metric(metric=metric)
            if isinstance(metric, str)
            else add_y_train_argument(metric)
        ]
    else:
        metrics = [
            _get_metric(metric=m)
            if isinstance(m, str)
            else add_y_train_argument(m) 
            for m in metric
        ]

    # In-sample residuals only need to be stored at fit time when they will be
    # used later (use_in_sample_residuals=True).
    store_in_sample_residuals = True if use_in_sample_residuals else False
    if interval is None:
        forecaster._probabilistic_mode = False
    elif use_binned_residuals:
        forecaster._probabilistic_mode = 'binned'
    else:
        forecaster._probabilistic_mode = 'no_binned'

    folds = cv.split(X=series, as_pandas=False)
    span_index = cv._extract_index(X=series)
    initial_train_size = cv.initial_train_size
    gap = cv.gap

    if initial_train_size is not None:
        # NOTE: This allows for parallelization when `refit` is `False`. The initial 
        # Forecaster fit occurs outside of the auxiliary function.
        data_fold = _extract_data_folds_multiseries(
                        series             = series,
                        folds              = [folds[0]],
                        span_index         = span_index,
                        window_size        = forecaster.window_size,
                        exog               = exog,
                        dropna_last_window = forecaster.dropna_from_series,
                        externally_fitted  = False
                    )
        series_train, _, last_window_levels, exog_train, _, _ = next(data_fold)
        forecaster.fit(
            series                    = series_train,
            exog                      = exog_train,
            store_last_window         = last_window_levels,
            store_in_sample_residuals = store_in_sample_residuals,
            suppress_warnings         = suppress_warnings
        )
        # Position 5 of each fold flags whether the forecaster must be fitted
        # for that fold; the first fold has just been fitted above.
        folds[0][5] = False
        
    if refit:
        n_of_fits = int(len(folds) / refit)
        if type(forecaster).__name__ != 'ForecasterDirectMultiVariate' and n_of_fits > 50:
            warnings.warn(
                f"The forecaster will be fit {n_of_fits} times. This can take substantial "
                f"amounts of time. If not feasible, try with `refit = False`.\n",
                LongTrainingWarning,
            )
        elif type(forecaster).__name__ == 'ForecasterDirectMultiVariate' and n_of_fits * forecaster.max_step > 50:
            warnings.warn(
                f"The forecaster will be fit {n_of_fits * forecaster.max_step} times "
                f"({n_of_fits} folds * {forecaster.max_step} estimators). This can take "
                f"substantial amounts of time. If not feasible, try with `refit = False`.\n",
                LongTrainingWarning
            )

    if show_progress:
        folds = tqdm(folds)
        
    # If there is no initial train size, the forecaster is assumed to be
    # already fitted by the caller (externally fitted).
    externally_fitted = True if initial_train_size is None else False
    data_folds = _extract_data_folds_multiseries(
                     series             = series,
                     folds              = folds,
                     span_index         = span_index,
                     window_size        = forecaster.window_size,
                     exog               = exog,
                     dropna_last_window = forecaster.dropna_from_series,
                     externally_fitted  = externally_fitted
                 )

    def _fit_predict_forecaster(
        data_fold, forecaster, store_in_sample_residuals, levels, gap, 
        interval, interval_method, n_boot, use_in_sample_residuals, 
        use_binned_residuals, random_state, return_predictors, suppress_warnings
    ) -> tuple[pd.DataFrame, list[str]]:
        """
        Fit the forecaster and predict `steps` ahead. This is an auxiliary 
        function used to parallelize the backtesting_forecaster_multiseries
        function. Returns the predictions for the fold and the levels that
        were actually predicted.
        """

        (
            series_train,
            last_window_series,
            last_window_levels,
            exog_train,
            next_window_exog,
            fold
        ) = data_fold

        # fold[5] indicates whether the forecaster must be (re)fitted for this fold.
        if fold[5] is True:
            forecaster.fit(
                series                    = series_train, 
                exog                      = exog_train,
                store_last_window         = last_window_levels,
                store_in_sample_residuals = store_in_sample_residuals,
                suppress_warnings         = suppress_warnings
            )

        test_iloc_start = fold[3][0]
        test_iloc_end   = fold[3][1]
        steps = len(range(test_iloc_start, test_iloc_end))
        if type(forecaster).__name__ == 'ForecasterDirectMultiVariate' and gap > 0:
            # Select only the steps that need to be predicted if gap > 0
            test_no_gap_iloc_start = fold[4][0]
            test_no_gap_iloc_end   = fold[4][1]
            steps = list(
                np.arange(len(range(test_no_gap_iloc_start, test_no_gap_iloc_end))) 
                + gap 
                + 1
            )

        preds = []
        # Only predict levels for which a last window is available.
        levels_predict = [level for level in levels if level in last_window_levels]
        if interval is not None:
            kwargs_interval = {
                'steps': steps,
                'levels': levels_predict,
                'last_window': last_window_series,
                'exog': next_window_exog,
                'n_boot': n_boot,
                'use_in_sample_residuals': use_in_sample_residuals,
                'use_binned_residuals': use_binned_residuals,
                'random_state': random_state,
                'suppress_warnings': suppress_warnings
            }
            if interval_method == 'bootstrapping':
                if interval == 'bootstrapping':
                    pred = forecaster.predict_bootstrapping(**kwargs_interval)
                elif isinstance(interval, (list, tuple)):
                    quantiles = [q / 100 for q in interval]
                    pred = forecaster.predict_quantiles(quantiles=quantiles, **kwargs_interval)
                    if len(interval) == 2:
                        cols_names = ['level', 'lower_bound', 'upper_bound']
                    else:
                        cols_names = ['level'] + [f'p_{p}' for p in interval]  
                    pred.columns = cols_names
                else:
                    pred = forecaster.predict_dist(distribution=interval, **kwargs_interval)
                 
                # NOTE: Remove column 'level' as it already exists from predict()
                preds.append(pred.iloc[:, 1:])
            else:
                pred = forecaster.predict_interval(
                    method='conformal', interval=interval, **kwargs_interval
                )
                preds.append(pred)

        # NOTE: This is done after probabilistic predictions to avoid repeating 
        # the same checks.
        if interval is None or interval_method != 'conformal':
            pred = forecaster.predict(
                       steps             = steps, 
                       levels            = levels_predict, 
                       last_window       = last_window_series,
                       exog              = next_window_exog,
                       suppress_warnings = suppress_warnings,
                       check_inputs      = True if interval is None else False
                   )
            preds.insert(0, pred)

        if return_predictors:
            # NOTE: ForecasterRNN is not allowed for return_predictors as it 
            # returns two DataFrames, X_predict, exog_predict.
            # NOTE: Remove column 'level' as it already exists from predict()
            pred = forecaster.create_predict_X(
                       steps             = steps,
                       levels            = levels_predict, 
                       last_window       = last_window_series,
                       exog              = next_window_exog,
                       suppress_warnings = suppress_warnings,
                       check_inputs      = False
                   ).iloc[:, 1:]
            preds.append(pred)

        if len(preds) == 1:
            pred = preds[0]
        else:
            pred = pd.concat(preds, axis=1)

        # TODO: Check when long format with multiple levels in multivariate
        if type(forecaster).__name__ != 'ForecasterDirectMultiVariate' and gap > 0:
            pred = pred.iloc[len(levels_predict) * gap:, :]

        return pred, levels_predict

    kwargs_fit_predict_forecaster = {
        "forecaster": forecaster,
        "store_in_sample_residuals": store_in_sample_residuals,
        "levels": levels,
        "gap": gap,
        "interval": interval,
        "interval_method": interval_method,
        "n_boot": n_boot,
        "use_in_sample_residuals": use_in_sample_residuals,
        "use_binned_residuals": use_binned_residuals,
        "random_state": random_state,
        "return_predictors": return_predictors,
        "suppress_warnings": suppress_warnings
    }
    results = Parallel(n_jobs=n_jobs)(
        delayed(_fit_predict_forecaster)(
            data_fold=data_fold, **kwargs_fit_predict_forecaster
        )
        for data_fold in data_folds
    )

    # Tag every predicted row with the number of the fold that produced it.
    backtest_predictions = [result[0] for result in results]
    fold_labels = [
        np.repeat(fold[0], backtest_predictions[i].shape[0]) for i, fold in enumerate(folds)
    ]
    backtest_predictions = pd.concat(backtest_predictions, axis=0)
    backtest_predictions.insert(0, 'fold', np.concatenate(fold_labels))
    backtest_levels = set(chain(*[result[1] for result in results]))
    
    backtest_predictions = (
        backtest_predictions
        .rename_axis('idx', axis=0)
        .set_index('level', append=True)
    )

    # Predictions made for periods where the original series has missing
    # values are set to NaN.
    backtest_predictions_grouped = backtest_predictions.groupby('level', sort=False)
    for level, indices in backtest_predictions_grouped.groups.items():
        if level in backtest_levels:
            valid_index = series[level].dropna().index
            valid_index = pd.MultiIndex.from_product([valid_index, [level]], names=['idx', 'level'])
            no_valid_index = indices.difference(valid_index, sort=False)
            backtest_predictions.loc[no_valid_index, 'pred'] = np.nan

    # When folds overlap, only the most recent prediction for each
    # (idx, level) pair is used to compute the metrics.
    backtest_predictions_for_metrics = backtest_predictions
    if overlapping_folds:
        backtest_predictions_for_metrics = (
            backtest_predictions_for_metrics
            .loc[~backtest_predictions_for_metrics.index.duplicated(keep='last')]
        )

    metrics_levels = _calculate_metrics_backtesting_multiseries(
        series                = series,
        predictions           = backtest_predictions_for_metrics[['pred']],
        folds                 = folds,
        span_index            = span_index,
        window_size           = forecaster.window_size,
        metrics               = metrics,
        levels                = levels,
        add_aggregated_metric = add_aggregated_metric
    )

    backtest_predictions = (
        backtest_predictions
        .reset_index('level')
        .rename_axis(None, axis=0)
    )
       
    set_skforecast_warnings(suppress_warnings, action='default')

    return metrics_levels, backtest_predictions


def backtesting_forecaster_multiseries(
    forecaster: object,
    series: pd.DataFrame | dict[str, pd.Series | pd.DataFrame],
    cv: TimeSeriesFold,
    metric: str | Callable | list[str | Callable],
    levels: str | list[str] | None = None,
    add_aggregated_metric: bool = True,
    exog: pd.Series | pd.DataFrame | dict[str, pd.Series | pd.DataFrame] | None = None,
    interval: float | list[float] | tuple[float] | str | object | None = None,
    interval_method: str = 'conformal',
    n_boot: int = 250,
    use_in_sample_residuals: bool = True,
    use_binned_residuals: bool = True,
    random_state: int = 123,
    return_predictors: bool = False,
    n_jobs: int | str = 'auto',
    verbose: bool = False,
    show_progress: bool = True,
    suppress_warnings: bool = False
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Backtest a multi-series forecaster over the folds generated by a
    TimeSeriesFold object, evaluating the supplied metric(s).

    If `forecaster` is already trained and `initial_train_size` is set to
    `None` in the TimeSeriesFold class, no initial fit is performed and all
    data is used to evaluate the model. The first
    `len(forecaster.last_window)` observations are still required to build the
    initial predictors, so no predictions are produced for them.

    A copy of the original forecaster is created internally, so it is not
    modified during the process.

    Parameters
    ----------
    forecaster : ForecasterRecursiveMultiSeries, ForecasterDirectMultiVariate, ForecasterRnn
        Forecaster model.
    series : pandas DataFrame, dict
        Training time series.
    cv : TimeSeriesFold
        TimeSeriesFold object with the information needed to split the data
        into folds.
    metric : str, Callable, list
        Metric(s) used to quantify the goodness of fit of the model.

        - If `string`: {'mean_squared_error', 'mean_absolute_error',
        'mean_absolute_percentage_error', 'mean_squared_log_error',
        'mean_absolute_scaled_error', 'root_mean_squared_scaled_error'}
        - If `Callable`: Function with arguments `y_true`, `y_pred` and
        `y_train` (Optional) that returns a float.
        - If `list`: List containing multiple strings and/or Callables.
    levels : str, list, default None
        Time series to be predicted. If `None` all levels will be predicted.
    add_aggregated_metric : bool, default True
        If `True`, and multiple series (`levels`) are predicted, aggregated
        metrics are also returned: 'average' (arithmetic mean over levels),
        'weighted_average' (weighted by the number of predicted values of
        each level) and 'pooling' (all levels pooled before computing the
        metric).
    exog : pandas Series, pandas DataFrame, dict, default None
        Exogenous variables.
    interval : float, list, tuple, str, object, default None
        Specifies whether probabilistic predictions should be estimated and
        the method to use:

        - If `float`: nominal (expected) coverage between 0 and 1, e.g.
        `interval=0.95` corresponds to `[2.5, 97.5]` percentiles.
        - If `list` or `tuple`: sequence of percentiles to compute, each
        between 0 and 100 inclusive, e.g. `[2.5, 97.5]` or `[10, 50, 90]`.
        - If 'bootstrapping' (str): `n_boot` bootstrapping predictions are
        generated.
        - If scipy.stats distribution object: the distribution parameters
        are estimated for each prediction.
        - If None: no probabilistic predictions are estimated.
    interval_method : str, default 'conformal'
        Technique used to estimate prediction intervals: 'bootstrapping' [1]_
        or 'conformal' (conformal prediction split method) [2]_.
    n_boot : int, default 250
        Number of bootstrapping iterations when estimating prediction
        intervals.
    use_in_sample_residuals : bool, default True
        If `True`, residuals from the training data are used as proxy of the
        prediction error. If `False`, out-of-sample (calibration) residuals
        are used; these must be precomputed with the Forecaster's
        `set_out_sample_residuals()` method.
    use_binned_residuals : bool, default True
        If `True`, residuals are selected conditioned on the predicted values
        (binned selection). If `False`, residuals are selected randomly.
    random_state : int, default 123
        Seed for the random number generator to ensure reproducibility.
    return_predictors : bool, default False
        If `True`, the predictors used to make the predictions are also
        returned.
    n_jobs : int, 'auto', default 'auto'
        Number of jobs to run in parallel. If `-1`, the number of cores is
        used. If 'auto', it is chosen by
        skforecast.utils.select_n_jobs_backtesting.
    verbose : bool, default False
        Print number of folds and index of training and validation sets used
        for backtesting.
    show_progress : bool, default True
        Whether to show a progress bar.
    suppress_warnings : bool, default False
        If `True`, skforecast warnings are suppressed during the backtesting
        process. See skforecast.exceptions.warn_skforecast_categories for
        more information.

    Returns
    -------
    metrics_levels : pandas DataFrame
        Value(s) of the metric(s). Index are the levels and columns the
        metrics.
    backtest_predictions : pandas DataFrame
        Long-format DataFrame with columns `level`, `fold` and `pred`. When
        `interval` is not `None`, extra columns are added depending on the
        method (`lower_bound`/`upper_bound`, one column per percentile, one
        column per bootstrapping iteration, or one column per distribution
        parameter). When `return_predictors` is `True`, one column per
        predictor is added. Depending on the relation between `steps` and
        `fold_stride`, the output may include repeated indexes
        (`fold_stride < steps`) or gaps (`fold_stride > steps`). See Notes.

    Notes
    -----
    Note on `fold_stride` vs. `steps`:

    - If `fold_stride == steps`, test sets are back-to-back without overlap
    and the output index is unique.
    - If `fold_stride < steps`, test sets overlap and the output contains
    repeated indexes.
    - If `fold_stride > steps`, there are gaps between consecutive test sets
    and the output has non-contiguous indexes.

    References
    ----------
    .. [1] Forecasting: Principles and Practice (3rd ed) Rob J Hyndman and George Athanasopoulos.
           https://otexts.com/fpp3/prediction-intervals.html

    .. [2] MAPIE - Model Agnostic Prediction Interval Estimator.
           https://mapie.readthedocs.io/en/stable/theoretical_description_regression.html#the-split-method

    """

    allowed_forecasters = [
        'ForecasterRecursiveMultiSeries', 
        'ForecasterDirectMultiVariate',
        'ForecasterRnn'
    ]
    forecaster_name = type(forecaster).__name__

    if forecaster_name not in allowed_forecasters:
        raise TypeError(
            f"`forecaster` must be of type {allowed_forecasters}, "
            f"for all other types of forecasters use the functions available in "
            f"the `model_selection` module. Got {forecaster_name}"
        )

    set_skforecast_warnings(suppress_warnings, action='ignore')

    # ForecasterRecursiveMultiSeries accepts heterogeneous `series`/`exog`
    # inputs that must be normalized before validation.
    if forecaster_name == 'ForecasterRecursiveMultiSeries':
        series, series_indexes = check_preprocess_series(series)
        if exog is not None:
            series_names_in_ = list(series.keys())
            exog, _ = check_preprocess_exog_multiseries(
                          series_names_in_  = series_names_in_,
                          series_index_type = type(series_indexes[series_names_in_[0]]),
                          exog              = exog,
                          exog_dict         = dict.fromkeys(series_names_in_)
                      )

    set_skforecast_warnings(suppress_warnings, action='default')

    # Validate the full argument set before delegating the heavy lifting.
    check_backtesting_input(
        forecaster              = forecaster,
        cv                      = cv,
        metric                  = metric,
        add_aggregated_metric   = add_aggregated_metric,
        series                  = series,
        exog                    = exog,
        interval                = interval,
        interval_method         = interval_method,
        n_boot                  = n_boot,
        use_in_sample_residuals = use_in_sample_residuals,
        use_binned_residuals    = use_binned_residuals,
        random_state            = random_state,
        return_predictors       = return_predictors,
        n_jobs                  = n_jobs,
        show_progress           = show_progress,
        suppress_warnings       = suppress_warnings
    )

    return _backtesting_forecaster_multiseries(
        forecaster              = forecaster,
        series                  = series,
        cv                      = cv,
        levels                  = levels,
        metric                  = metric,
        add_aggregated_metric   = add_aggregated_metric,
        exog                    = exog,
        interval                = interval,
        interval_method         = interval_method,
        n_boot                  = n_boot,
        use_in_sample_residuals = use_in_sample_residuals,
        use_binned_residuals    = use_binned_residuals,
        random_state            = random_state,
        return_predictors       = return_predictors,
        n_jobs                  = n_jobs,
        verbose                 = verbose,
        show_progress           = show_progress,
        suppress_warnings       = suppress_warnings
    )

# TODO: Remove in version 0.20.0
@runtime_deprecated(replacement="_backtesting_stats", version="0.19.0", removal="0.20.0")
@deprecated("`_backtesting_sarimax` is deprecated since version 0.19.0; use `_backtesting_stats` instead. It will be removed in version 0.20.0.")
def _backtesting_sarimax(
    forecaster: object,
    y: pd.Series,
    metric: str | Callable | list[str | Callable],
    cv: TimeSeriesFold,
    exog: pd.Series | pd.DataFrame | None = None,
    alpha: float | None = None,
    interval: list[float] | tuple[float] | None = None,
    n_jobs: int | str = 'auto',
    suppress_warnings_fit: bool = False,
    verbose: bool = False,
    show_progress: bool = True,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    !!! warning "Deprecated"
        Deprecated since skforecast 0.19. Use `_backtesting_stats` instead.

    """

    # Forward every argument unchanged to the replacement implementation.
    forwarded_args = {
        'forecaster': forecaster,
        'y': y,
        'metric': metric,
        'cv': cv,
        'exog': exog,
        'alpha': alpha,
        'interval': interval,
        'n_jobs': n_jobs,
        'suppress_warnings_fit': suppress_warnings_fit,
        'verbose': verbose,
        'show_progress': show_progress,
    }

    return _backtesting_stats(**forwarded_args)

# TODO: Remove in version 0.20.0
@runtime_deprecated(replacement="backtesting_stats", version="0.19.0", removal="0.20.0")
@deprecated("`backtesting_sarimax` is deprecated since version 0.19.0; use `backtesting_stats` instead. It will be removed in version 0.20.0.")
def backtesting_sarimax(
    forecaster: object,
    y: pd.Series,
    cv: TimeSeriesFold,
    metric: str | Callable | list[str | Callable],
    exog: pd.Series | pd.DataFrame | None = None,
    alpha: float | None = None,
    interval: list[float] | tuple[float] | None = None,
    n_jobs: int | str = 'auto',
    verbose: bool = False,
    suppress_warnings_fit: bool = False,
    show_progress: bool = True
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    !!! warning "Deprecated"
        This function is deprecated since skforecast 0.19. Please use `backtesting_stats` instead.

    """
    # Backward-compatibility shim: delegate unchanged to the renamed public
    # entry point.
    forwarded = dict(
        forecaster=forecaster,
        y=y,
        cv=cv,
        metric=metric,
        exog=exog,
        alpha=alpha,
        interval=interval,
        n_jobs=n_jobs,
        verbose=verbose,
        suppress_warnings_fit=suppress_warnings_fit,
        show_progress=show_progress,
    )

    return backtesting_stats(**forwarded)
    

def _backtesting_stats(
    forecaster: object,
    y: pd.Series,
    metric: str | Callable | list[str | Callable],
    cv: TimeSeriesFold,
    exog: pd.Series | pd.DataFrame | None = None,
    alpha: float | None = None,
    interval: list[float] | tuple[float] | None = None,
    n_jobs: int | str = 'auto',
    suppress_warnings_fit: bool = False,
    verbose: bool = False,
    show_progress: bool = True,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Backtesting of ForecasterStats.
    
    A copy of the original forecaster is created so that it is not modified during 
    the process.
    
    Parameters
    ----------
    forecaster : ForecasterStats
        Forecaster model.
    y : pandas Series
        Training time series.
    metric : str, Callable, list
        Metric used to quantify the goodness of fit of the model.
        
        - If `string`: {'mean_squared_error', 'mean_absolute_error',
        'mean_absolute_percentage_error', 'mean_squared_log_error',
        'mean_absolute_scaled_error', 'root_mean_squared_scaled_error'}
        - If `Callable`: Function with arguments `y_true`, `y_pred` and `y_train`
        (Optional) that returns a float.
        - If `list`: List containing multiple strings and/or Callables.
    cv : TimeSeriesFold
        TimeSeriesFold object with the information needed to split the data into folds.
    exog : pandas Series, pandas DataFrame, default None
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `y` and should be aligned so that y[i] is
        regressed on exog[i].
    alpha : float, default 0.05
        The confidence intervals for the forecasts are (1 - alpha) %.
        If both, `alpha` and `interval` are provided, `alpha` will be used.
    interval : list, tuple, default None
        Confidence of the prediction interval estimated. The values must be
        symmetric. Sequence of percentiles to compute, which must be between 
        0 and 100 inclusive. For example, interval of 95% should be as 
        `interval = [2.5, 97.5]`. If both, `alpha` and `interval` are 
        provided, `alpha` will be used.
    n_jobs : int, 'auto', default 'auto'
        The number of jobs to run in parallel. If `-1`, then the number of jobs is 
        set to the number of cores. If 'auto', `n_jobs` is set using the function
        skforecast.utils.select_n_jobs_backtesting.
    verbose : bool, default False
        Print number of folds and index of training and validation sets used 
        for backtesting.
    suppress_warnings_fit : bool, default False
        If `True`, warnings generated during fitting will be ignored.
    show_progress : bool, default True
        Whether to show a progress bar.

    Returns
    -------
    metric_values : pandas DataFrame
        Value(s) of the metric(s).
    backtest_predictions : pandas DataFrame
        Value of predictions. The  DataFrame includes the following columns:

        - fold: Indicates the fold number where the prediction was made.
        - pred: Predicted values for the corresponding series and time steps.

        If `interval` is not `None`, additional columns are included:

        - lower_bound: lower bound of the interval.
        - upper_bound: upper bound of the interval.

        Depending on the relation between `steps` and `fold_stride`, the output
        may include repeated indexes (if `fold_stride < steps`) or gaps
        (if `fold_stride > steps`). See Notes below for more details.

    Notes
    -----
    Note on `fold_stride` vs. `steps`:

    - If `fold_stride == steps`, test sets are placed back-to-back without overlap. 
    Each observation appears only once in the output DataFrame, so the index is unique.
    - If `fold_stride < steps`, test sets overlap. Multiple forecasts are generated 
    for the same observations and, therefore, the output DataFrame contains repeated 
    indexes.
    - If `fold_stride > steps`, there are gaps between consecutive test sets. 
    Some observations in the series will not have associated predictions, so 
    the output DataFrame has non-contiguous indexes.
    
    """

    # Work on deep copies so the caller's forecaster and cv objects are never
    # mutated by the backtesting process.
    forecaster = deepcopy(forecaster)
    cv = deepcopy(cv)

    # Estimators other than skforecast's own Sarimax are forced to refit on
    # every fold: their predictions must start right after their training set,
    # so predicting from an arbitrary last window is not possible.
    estimator_type = f"{type(forecaster.estimator).__module__}.{type(forecaster.estimator).__name__}"
    if estimator_type != "skforecast.stats._sarimax.Sarimax" and cv.refit is False:
        warnings.warn(
            "If `ForecasterStats` uses a estimator different from "
            "`skforecast.stats.Sarimax`, `cv.refit` must be `True` since "
            "predictions must start from the end of the training set."
            " Setting `cv.refit = True`.",
            IgnoredArgumentWarning
        )
        cv.refit = True

    # Synchronize the fold generator with the forecaster's window size and
    # request plain integer index ranges (not full index arrays).
    cv.set_params({
        'window_size': forecaster.window_size,
        'return_all_indexes': False,
        'verbose': verbose
    })

    refit = cv.refit
    overlapping_folds = cv.overlapping_folds
    
    # Resolve `n_jobs`. Parallel execution is only used when every fold refits
    # the forecaster; without refit the folds share the same fitted state and
    # must run sequentially.
    if refit == False:
        if n_jobs != 'auto' and n_jobs != 1:
            warnings.warn(
                "If `refit = False`, `n_jobs` is set to 1 to avoid unexpected "
                "results during parallelization.",
                IgnoredArgumentWarning
            )
        n_jobs = 1
    else:
        if n_jobs == 'auto':        
            n_jobs = select_n_jobs_backtesting(
                         forecaster = forecaster,
                         refit      = refit
                     )
        elif not isinstance(refit, bool) and refit != 1 and n_jobs != 1:
            # Intermittent refit (refit every k folds) carries state between
            # folds, so it is also forced to run sequentially.
            warnings.warn(
                "If `refit` is an integer other than 1 (intermittent refit). `n_jobs` "
                "is set to 1 to avoid unexpected results during parallelization.",
                IgnoredArgumentWarning
            )
            n_jobs = 1
        else:
            n_jobs = n_jobs if n_jobs > 0 else cpu_count()

    # Normalize `metric` into a list of callables that accept `y_true`,
    # `y_pred` and (optionally) `y_train`.
    if not isinstance(metric, list):
        metrics = [
            _get_metric(metric=metric)
            if isinstance(metric, str)
            else add_y_train_argument(metric)
        ]
    else:
        metrics = [
            _get_metric(metric=m)
            if isinstance(m, str)
            else add_y_train_argument(m) 
            for m in metric
        ]
    
    # `folds` is a list of per-fold index descriptors. From how they are used
    # below: fold[0] = fold number, fold[1] = (train_start, train_end),
    # fold[2] = last-window range, fold[3] = (test_start, test_end) and
    # fold[5] = whether the forecaster must be fitted for this fold.
    # NOTE(review): layout inferred from usage - confirm against
    # TimeSeriesFold.split.
    folds = cv.split(X=y, as_pandas=False)
    initial_train_size = cv.initial_train_size
    steps = cv.steps
    gap = cv.gap

    # NOTE: initial_train_size cannot be None because of append method in Sarimax
    # NOTE: This allows for parallelization when `refit` is `False`. The initial 
    # Forecaster fit occurs outside of the auxiliary function.
    exog_train = exog.iloc[:initial_train_size, ] if exog is not None else None
    forecaster.fit(
        y                 = y.iloc[:initial_train_size, ],
        exog              = exog_train,
        suppress_warnings = suppress_warnings_fit
    )
    # The first fold is already fitted above, so mark it as not needing a fit.
    folds[0][5] = False
    
    if refit:
        # With boolean refit, `refit` behaves as 1 (True == 1), i.e. one fit
        # per fold. Warn when the total number of fits is likely to be slow.
        n_of_fits = int(len(folds) / refit)
        if n_of_fits > 50:
            warnings.warn(
                f"The forecaster will be fit {n_of_fits} times. This can take substantial "
                f"amounts of time. If not feasible, try with `refit = False`.\n",
                LongTrainingWarning
            )
       
    folds_tqdm = tqdm(folds) if show_progress else folds

    # Closure over `refit`, `folds`, `suppress_warnings_fit` from the
    # enclosing scope; each call processes one fold.
    def _fit_predict_forecaster(
        y, exog, forecaster, alpha, interval, fold, steps, gap
    ) -> pd.DataFrame:
        """
        Fit the forecaster and predict `steps` ahead. This is an auxiliary 
        function used to parallelize the backtesting_forecaster function.
        """

        # In each iteration the model is fitted before making predictions. 
        # if fixed_train_size the train size doesn't increase but moves by `steps` 
        # in each iteration. if False the train size increases by `steps` in each 
        # iteration.
        train_iloc_start = fold[1][0]
        train_iloc_end   = fold[1][1]
        test_iloc_start  = fold[3][0]
        test_iloc_end    = fold[3][1]

        if refit:
            last_window_iloc_start = fold[1][1]  # Same as train_iloc_end
            last_window_iloc_end   = fold[2][1]
        else:
            # Without refit, the last window is the `steps` observations that
            # immediately precede the test set.
            last_window_iloc_end   = fold[3][0]  # test_iloc_start
            last_window_iloc_start = last_window_iloc_end - steps

        if fold[5] is False:
            # When the model is not fitted, last_window and last_window_exog must 
            # be updated to include the data needed to make predictions.
            last_window_y = y.iloc[last_window_iloc_start:last_window_iloc_end]
            last_window_exog = exog.iloc[last_window_iloc_start:last_window_iloc_end] if exog is not None else None 
        else:
            # The model is fitted before making predictions. If `fixed_train_size`  
            # the train size doesn't increase but moves by `steps` in each iteration. 
            # If `False` the train size increases by `steps` in each  iteration.
            y_train = y.iloc[train_iloc_start:train_iloc_end, ]
            exog_train = exog.iloc[train_iloc_start:train_iloc_end, ] if exog is not None else None
            
            last_window_y = None
            last_window_exog = None

            forecaster.fit(y=y_train, exog=exog_train, suppress_warnings=suppress_warnings_fit)

        next_window_exog = exog.iloc[test_iloc_start:test_iloc_end, ] if exog is not None else None

        # After the first fit, Sarimax must use the last windows stored in the model
        if fold == folds[0]:
            last_window_y = None
            last_window_exog = None

        # Actual horizon for this fold; may differ from `cv.steps` (e.g. a
        # shorter final fold). Shadows the `steps` argument from here on.
        steps = len(range(test_iloc_start, test_iloc_end))
        if alpha is None and interval is None:
            pred = forecaster.predict(
                       steps            = steps,
                       last_window      = last_window_y,
                       last_window_exog = last_window_exog,
                       exog             = next_window_exog
                   )
        else:
            pred = forecaster.predict_interval(
                       steps            = steps,
                       exog             = next_window_exog,
                       alpha            = alpha,
                       interval         = interval,
                       last_window      = last_window_y,
                       last_window_exog = last_window_exog
                   )

        # Drop the first `gap` predictions so the output starts at the test
        # set proper.
        pred = pred.iloc[gap:, ]            
        
        return pred

    # With `refit = False` this runs sequentially (n_jobs forced to 1 above).
    backtest_predictions = Parallel(n_jobs=n_jobs)(
        delayed(_fit_predict_forecaster)(
            y=y,
            exog=exog,
            forecaster=forecaster,
            alpha=alpha,
            interval=interval,
            fold=fold,
            steps=steps,
            gap=gap,
        )
        for fold in folds_tqdm
    )
    # One fold label per predicted row, used for the 'fold' output column.
    fold_labels = [
        np.repeat(fold[0], backtest_predictions[i].shape[0]) for i, fold in enumerate(folds)
    ]
    
    backtest_predictions = pd.concat(backtest_predictions)
    if isinstance(backtest_predictions, pd.Series):
        backtest_predictions = pd.DataFrame(backtest_predictions)
    backtest_predictions.insert(0, 'fold', np.concatenate(fold_labels))

    # Collect the union of all positions ever used for training (the initial
    # fit plus every refit fold); this is the `y_train` handed to metrics that
    # accept training data (e.g. scaled errors).
    train_indexes = []
    for i, fold in enumerate(folds):
        fit_fold = fold[-1]
        if i == 0 or fit_fold:
            train_iloc_start = fold[1][0]
            train_iloc_end = fold[1][1]
            train_indexes.append(np.arange(train_iloc_start, train_iloc_end))
    
    train_indexes = np.unique(np.concatenate(train_indexes))
    y_train = y.iloc[train_indexes]

    # When folds overlap, the same timestamp can be predicted several times.
    # Metrics are computed on the most recent prediction per index, while the
    # returned DataFrame keeps all (possibly duplicated) predictions.
    backtest_predictions_for_metrics = backtest_predictions
    if overlapping_folds:
        backtest_predictions_for_metrics = (
            backtest_predictions_for_metrics
            .loc[~backtest_predictions_for_metrics.index.duplicated(keep='last')]
        )

    metric_values = [
        m(
            y_true = y.loc[backtest_predictions_for_metrics.index],
            y_pred = backtest_predictions_for_metrics['pred'],
            y_train = y_train
        ) 
        for m in metrics
    ]

    # Single-row DataFrame: one column per metric.
    metric_values = pd.DataFrame(
        data    = [metric_values],
        columns = [m.__name__ for m in metrics]
    )

    return metric_values, backtest_predictions


def backtesting_stats(
    forecaster: object,
    y: pd.Series,
    cv: TimeSeriesFold,
    metric: str | Callable | list[str | Callable],
    exog: pd.Series | pd.DataFrame | None = None,
    alpha: float | None = None,
    interval: list[float] | tuple[float] | None = None,
    n_jobs: int | str = 'auto',
    verbose: bool = False,
    suppress_warnings_fit: bool = False,
    show_progress: bool = True
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Backtesting of ForecasterStats.

    The forecaster is deep-copied internally, so the object passed by the
    caller is never modified.

    Parameters
    ----------
    forecaster : ForecasterStats
        Forecaster model.
    y : pandas Series
        Training time series.
    cv : TimeSeriesFold
        TimeSeriesFold object with the information needed to split the data
        into folds.
    metric : str, Callable, list
        Metric used to quantify the goodness of fit of the model.

        - If `string`: {'mean_squared_error', 'mean_absolute_error',
        'mean_absolute_percentage_error', 'mean_squared_log_error',
        'mean_absolute_scaled_error', 'root_mean_squared_scaled_error'}
        - If `Callable`: Function with arguments `y_true`, `y_pred` and
        `y_train` (Optional) that returns a float.
        - If `list`: List containing multiple strings and/or Callables.
    exog : pandas Series, pandas DataFrame, default None
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `y` and be aligned so that y[i] is
        regressed on exog[i].
    alpha : float, default 0.05
        The confidence intervals for the forecasts are (1 - alpha) %. If both
        `alpha` and `interval` are provided, `alpha` is used.
    interval : list, tuple, default None
        Confidence of the prediction interval estimated. The values must be
        symmetric percentiles between 0 and 100 inclusive, e.g.
        `interval = [2.5, 97.5]` for a 95% interval. Ignored when `alpha`
        is provided.
    n_jobs : int, 'auto', default 'auto'
        The number of jobs to run in parallel. If `-1`, the number of jobs is
        set to the number of cores. If 'auto', `n_jobs` is chosen by
        skforecast.utils.select_n_jobs_backtesting.
    verbose : bool, default False
        Print number of folds and index of training and validation sets used
        for backtesting.
    suppress_warnings_fit : bool, default False
        If `True`, warnings generated during fitting will be ignored.
    show_progress : bool, default True
        Whether to show a progress bar.

    Returns
    -------
    metric_values : pandas DataFrame
        Value(s) of the metric(s).
    backtest_predictions : pandas DataFrame
        Predictions with columns 'fold' and 'pred' (plus 'lower_bound' and
        'upper_bound' when an interval is requested). Depending on how
        `fold_stride` relates to `steps`, the index may contain duplicates
        (`fold_stride < steps`) or gaps (`fold_stride > steps`).

    """

    # Guard clause: only ForecasterStats is supported here; other forecaster
    # types have their own backtesting entry points in `model_selection`.
    if type(forecaster).__name__ != 'ForecasterStats':
        raise TypeError(
            "`forecaster` must be of type `ForecasterStats`, for all other "
            "types of forecasters use the functions available in the other "
            "`model_selection` modules."
        )

    # Validate the user-supplied arguments before any expensive work.
    check_backtesting_input(
        forecaster=forecaster,
        cv=cv,
        y=y,
        metric=metric,
        interval=interval,
        alpha=alpha,
        n_jobs=n_jobs,
        show_progress=show_progress,
        suppress_warnings_fit=suppress_warnings_fit,
    )

    # Delegate the actual fold-by-fold work to the private implementation.
    return _backtesting_stats(
        forecaster=forecaster,
        y=y,
        cv=cv,
        metric=metric,
        exog=exog,
        alpha=alpha,
        interval=interval,
        n_jobs=n_jobs,
        verbose=verbose,
        suppress_warnings_fit=suppress_warnings_fit,
        show_progress=show_progress,
    )
